In [1]:
pip install shap
Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0)
Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.26.4)
Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.11.4)
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.5.1)
Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.1.4)
Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0)
Requirement already satisfied: packaging>20.9 in c:\users\chand\anaconda3\lib\site-packages (from shap) (23.1)
Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8)
Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0)
Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (3.5.0)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install interpret
Requirement already satisfied: interpret in c:\users\chand\anaconda3\lib\site-packages (0.6.3)
Requirement already satisfied: interpret-core==0.6.3 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.6.3)
Requirement already satisfied: numpy>=1.11.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.26.4)
Requirement already satisfied: scipy>=0.18.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.11.4)
Requirement already satisfied: pandas>=0.19.2 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.1.4)
Requirement already satisfied: scikit-learn>=0.18.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.5.1)
Requirement already satisfied: joblib>=0.11 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.2.0)
Requirement already satisfied: aplr>=10.5.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (10.6.0)
Requirement already satisfied: dash>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.17.1)
Requirement already satisfied: dash-core-components>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.0)
Requirement already satisfied: dash-html-components>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.0)
Requirement already satisfied: dash-table>=4.1.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.0.0)
Requirement already satisfied: dash-cytoscape>=0.1.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.0.2)
Requirement already satisfied: gevent>=1.3.6 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (24.2.1)
Requirement already satisfied: requests>=2.19.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.31.0)
Requirement already satisfied: psutil>=5.6.2 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.9.0)
Requirement already satisfied: ipykernel>=4.10.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (6.28.0)
Requirement already satisfied: ipython>=5.5.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.20.0)
Requirement already satisfied: plotly>=3.8.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.9.0)
Requirement already satisfied: SALib>=1.3.3 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.5.0)
Requirement already satisfied: shap>=0.28.5 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.46.0)
Requirement already satisfied: dill>=0.2.5 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.3.8)
Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.5)
Requirement already satisfied: Werkzeug<3.1 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.3)
Requirement already satisfied: importlib-metadata in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (7.0.1)
Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.12.2)
Requirement already satisfied: retrying in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.3.4)
Requirement already satisfied: nest-asyncio in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.6.0)
Requirement already satisfied: setuptools in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (68.2.2)
Requirement already satisfied: zope.event in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.0)
Requirement already satisfied: zope.interface in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.4.0)
Requirement already satisfied: greenlet>=3.0rc3 in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.1)
Requirement already satisfied: cffi>=1.12.2 in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.16.0)
Requirement already satisfied: comm>=0.1.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.2)
Requirement already satisfied: debugpy>=1.6.5 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.6.7)
Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.6.0)
Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.5.0)
Requirement already satisfied: matplotlib-inline>=0.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.1.6)
Requirement already satisfied: packaging in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (23.1)
Requirement already satisfied: pyzmq>=24 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (25.1.2)
Requirement already satisfied: tornado>=6.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (6.3.3)
Requirement already satisfied: traitlets>=5.4.0 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.7.1)
Requirement already satisfied: decorator in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.1.1)
Requirement already satisfied: jedi>=0.16 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.18.1)
Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.43)
Requirement already satisfied: pygments>=2.4.0 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.15.1)
Requirement already satisfied: stack-data in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.0)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.4.6)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2023.3)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from plotly>=3.8.1->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.2.2)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2024.7.4)
Requirement already satisfied: matplotlib>=3.5 in c:\users\chand\anaconda3\lib\site-packages (from SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.8.0)
Requirement already satisfied: multiprocess in c:\users\chand\anaconda3\lib\site-packages (from SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.70.16)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18.1->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.5.0)
Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.65.0)
Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.0.8)
Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.59.0)
Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.1)
Requirement already satisfied: pycparser in c:\users\chand\anaconda3\lib\site-packages (from cffi>=1.12.2->gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.21)
Requirement already satisfied: Jinja2>=3.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.1.3)
Requirement already satisfied: itsdangerous>=2.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.1)
Requirement already satisfied: click>=8.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.1.7)
Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\users\chand\anaconda3\lib\site-packages (from jedi>=0.16->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.8.3)
Requirement already satisfied: platformdirs>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.10.0)
Requirement already satisfied: pywin32>=300 in c:\users\chand\anaconda3\lib\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (305.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.4.4)
Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.9)
Requirement already satisfied: wcwidth in c:\users\chand\anaconda3\lib\site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.5)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.16.0)
Requirement already satisfied: MarkupSafe>=2.1.1 in c:\users\chand\anaconda3\lib\site-packages (from Werkzeug<3.1->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.1.3)
Requirement already satisfied: zipp>=0.5 in c:\users\chand\anaconda3\lib\site-packages (from importlib-metadata->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.17.0)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.42.0)
Requirement already satisfied: executing in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.8.3)
Requirement already satisfied: asttokens in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.5)
Requirement already satisfied: pure-eval in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.2)
Note: you may need to restart the kernel to use updated packages.
In [3]:
pip install catboost shap
Requirement already satisfied: catboost in c:\users\chand\anaconda3\lib\site-packages (1.2.5)Note: you may need to restart the kernel to use updated packages.

Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0)
Requirement already satisfied: graphviz in c:\users\chand\anaconda3\lib\site-packages (from catboost) (0.20.3)
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from catboost) (3.8.0)
Requirement already satisfied: numpy>=1.16.0 in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.26.4)
Requirement already satisfied: pandas>=0.24 in c:\users\chand\anaconda3\lib\site-packages (from catboost) (2.1.4)
Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.11.4)
Requirement already satisfied: plotly in c:\users\chand\anaconda3\lib\site-packages (from catboost) (5.9.0)
Requirement already satisfied: six in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.16.0)
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.5.1)
Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0)
Requirement already satisfied: packaging>20.9 in c:\users\chand\anaconda3\lib\site-packages (from shap) (23.1)
Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8)
Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0)
Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1)
Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2023.3)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (1.4.4)
Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (3.0.9)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from plotly->catboost) (8.2.2)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (3.5.0)
In [4]:
pip install lime
Requirement already satisfied: lime in c:\users\chand\anaconda3\lib\site-packages (0.2.0.1)
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from lime) (3.8.0)
Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.26.4)
Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.11.4)
Requirement already satisfied: tqdm in c:\users\chand\anaconda3\lib\site-packages (from lime) (4.65.0)
Requirement already satisfied: scikit-learn>=0.18 in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.5.1)
Requirement already satisfied: scikit-image>=0.12 in c:\users\chand\anaconda3\lib\site-packages (from lime) (0.22.0)
Requirement already satisfied: networkx>=2.8 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (3.1)
Requirement already satisfied: pillow>=9.0.1 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (10.2.0)
Requirement already satisfied: imageio>=2.27 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2.33.1)
Requirement already satisfied: tifffile>=2022.8.12 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2023.4.12)
Requirement already satisfied: packaging>=21 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (23.1)
Requirement already satisfied: lazy_loader>=0.3 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (0.3)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (3.5.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (1.4.4)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (2.8.2)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm->lime) (0.4.6)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [5]:
pip install imodels
Requirement already satisfied: imodels in c:\users\chand\anaconda3\lib\site-packages (1.4.6)
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from imodels) (3.8.0)
Requirement already satisfied: mlxtend>=0.18.0 in c:\users\chand\anaconda3\lib\site-packages (from imodels) (0.23.1)
Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.26.4)
Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from imodels) (2.1.4)
Requirement already satisfied: requests in c:\users\chand\anaconda3\lib\site-packages (from imodels) (2.31.0)
Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.11.4)
Requirement already satisfied: scikit-learn>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.5.1)
Requirement already satisfied: tqdm in c:\users\chand\anaconda3\lib\site-packages (from imodels) (4.65.0)
Requirement already satisfied: joblib>=0.13.2 in c:\users\chand\anaconda3\lib\site-packages (from mlxtend>=0.18.0->imodels) (1.2.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (1.4.4)
Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->imodels) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->imodels) (2023.3)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=1.2.0->imodels) (3.5.0)
Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2.0.4)
Requirement already satisfied: idna<4,>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2.0.7)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2024.7.4)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm->imodels) (0.4.6)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->imodels) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [6]:
pip install matplotlib shap scikit-learn
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (3.8.0)
Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0)
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: numpy<2,>=1.21 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.26.4)
Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.11.4)
Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.1.4)
Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0)
Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8)
Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0)
Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6)
Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0)
Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3.post1)
Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3)
Note: you may need to restart the kernel to use updated packages.
In [7]:
pip install matplotlib scikit-learn
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (3.8.0)
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.4.4)
Requirement already satisfied: numpy<2,>=1.21 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.26.4)
Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (2.8.2)
Requirement already satisfied: scipy>=1.6.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [8]:
pip install -U scikit-learn
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1)
Requirement already satisfied: numpy>=1.19.5 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.11.4)
Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0)
Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0)
Note: you may need to restart the kernel to use updated packages.

Importing Libraries¶

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from collections import Counter
from scipy import stats
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')
sns.set()
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
plt.style.use('ggplot')
from sklearn.decomposition import PCA
import shap
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, confusion_matrix, roc_curve, roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import ConfusionMatrixDisplay

Basic Pre-Processing¶

#Loading the dataset¶

In [10]:
bc_data= pd.read_csv("C://Users/chand/Documents/Dissertation/Dataset/Breast_cancer.csv")
bc_data
Out[10]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
0 842302 M 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.30010 0.14710 ... 25.380 17.33 184.60 2019.0 0.16220 0.66560 0.7119 0.2654 0.4601 0.11890
1 842517 M 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.08690 0.07017 ... 24.990 23.41 158.80 1956.0 0.12380 0.18660 0.2416 0.1860 0.2750 0.08902
2 84300903 M 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.19740 0.12790 ... 23.570 25.53 152.50 1709.0 0.14440 0.42450 0.4504 0.2430 0.3613 0.08758
3 84348301 M 11.42 20.38 77.58 386.1 0.14250 0.28390 0.24140 0.10520 ... 14.910 26.50 98.87 567.7 0.20980 0.86630 0.6869 0.2575 0.6638 0.17300
4 84358402 M 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.19800 0.10430 ... 22.540 16.67 152.20 1575.0 0.13740 0.20500 0.4000 0.1625 0.2364 0.07678
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 926424 M 21.56 22.39 142.00 1479.0 0.11100 0.11590 0.24390 0.13890 ... 25.450 26.40 166.10 2027.0 0.14100 0.21130 0.4107 0.2216 0.2060 0.07115
565 926682 M 20.13 28.25 131.20 1261.0 0.09780 0.10340 0.14400 0.09791 ... 23.690 38.25 155.00 1731.0 0.11660 0.19220 0.3215 0.1628 0.2572 0.06637
566 926954 M 16.60 28.08 108.30 858.1 0.08455 0.10230 0.09251 0.05302 ... 18.980 34.12 126.70 1124.0 0.11390 0.30940 0.3403 0.1418 0.2218 0.07820
567 927241 M 20.60 29.33 140.10 1265.0 0.11780 0.27700 0.35140 0.15200 ... 25.740 39.42 184.60 1821.0 0.16500 0.86810 0.9387 0.2650 0.4087 0.12400
568 92751 B 7.76 24.54 47.92 181.0 0.05263 0.04362 0.00000 0.00000 ... 9.456 30.37 59.16 268.6 0.08996 0.06444 0.0000 0.0000 0.2871 0.07039

569 rows × 32 columns

In [11]:
bc_data.shape
Out[11]:
(569, 32)
In [12]:
bc_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             569 non-null    float64
 15  area_se                  569 non-null    float64
 16  smoothness_se            569 non-null    float64
 17  compactness_se           569 non-null    float64
 18  concavity_se             569 non-null    float64
 19  concave points_se        569 non-null    float64
 20  symmetry_se              569 non-null    float64
 21  fractal_dimension_se     569 non-null    float64
 22  radius_worst             569 non-null    float64
 23  texture_worst            569 non-null    float64
 24  perimeter_worst          569 non-null    float64
 25  area_worst               569 non-null    float64
 26  smoothness_worst         569 non-null    float64
 27  compactness_worst        569 non-null    float64
 28  concavity_worst          569 non-null    float64
 29  concave points_worst     569 non-null    float64
 30  symmetry_worst           569 non-null    float64
 31  fractal_dimension_worst  569 non-null    float64
dtypes: float64(30), int64(1), object(1)
memory usage: 142.4+ KB
In [13]:
# Convert the target variable 'diagnosis' column to numerical values

bc_data['diagnosis'] = bc_data['diagnosis'].map({'M': 1, 'B': 0})

Exploratory Data Analysis¶

#Summary statistics¶

In [14]:
bc_data.describe()
Out[14]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
count 5.690000e+02 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 ... 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000 569.000000
mean 3.037183e+07 0.372583 14.127292 19.289649 91.969033 654.889104 0.096360 0.104341 0.088799 0.048919 ... 16.269190 25.677223 107.261213 880.583128 0.132369 0.254265 0.272188 0.114606 0.290076 0.083946
std 1.250206e+08 0.483918 3.524049 4.301036 24.298981 351.914129 0.014064 0.052813 0.079720 0.038803 ... 4.833242 6.146258 33.602542 569.356993 0.022832 0.157336 0.208624 0.065732 0.061867 0.018061
min 8.670000e+03 0.000000 6.981000 9.710000 43.790000 143.500000 0.052630 0.019380 0.000000 0.000000 ... 7.930000 12.020000 50.410000 185.200000 0.071170 0.027290 0.000000 0.000000 0.156500 0.055040
25% 8.692180e+05 0.000000 11.700000 16.170000 75.170000 420.300000 0.086370 0.064920 0.029560 0.020310 ... 13.010000 21.080000 84.110000 515.300000 0.116600 0.147200 0.114500 0.064930 0.250400 0.071460
50% 9.060240e+05 0.000000 13.370000 18.840000 86.240000 551.100000 0.095870 0.092630 0.061540 0.033500 ... 14.970000 25.410000 97.660000 686.500000 0.131300 0.211900 0.226700 0.099930 0.282200 0.080040
75% 8.813129e+06 1.000000 15.780000 21.800000 104.100000 782.700000 0.105300 0.130400 0.130700 0.074000 ... 18.790000 29.720000 125.400000 1084.000000 0.146000 0.339100 0.382900 0.161400 0.317900 0.092080
max 9.113205e+08 1.000000 28.110000 39.280000 188.500000 2501.000000 0.163400 0.345400 0.426800 0.201200 ... 36.040000 49.540000 251.200000 4254.000000 0.222600 1.058000 1.252000 0.291000 0.663800 0.207500

8 rows × 32 columns

Distribution of Target Variable¶

In [15]:
plt.figure(figsize=(8, 6))
sns.countplot(x='diagnosis', data=bc_data, palette='viridis')
plt.title('Distribution of Diagnosis')
plt.xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
plt.ylabel('Count')
plt.show()
No description has been provided for this image

Density Graph to check the trends of data¶

In [16]:
plt.figure(figsize=(20,15))
plotnumber = 1
for column in bc_data:
    if plotnumber<=30:
        ax = plt.subplot(5,6,plotnumber)
        sns.distplot(bc_data[column])
        plt.xlabel(column)
    plotnumber+=1

plt.tight_layout()
plt.show()
No description has been provided for this image

Correlation Analysis¶

In [17]:
bc_data.corr()
Out[17]:
id diagnosis radius_mean texture_mean perimeter_mean area_mean smoothness_mean compactness_mean concavity_mean concave points_mean ... radius_worst texture_worst perimeter_worst area_worst smoothness_worst compactness_worst concavity_worst concave points_worst symmetry_worst fractal_dimension_worst
id 1.000000 0.039769 0.074626 0.099770 0.073159 0.096893 -0.012968 0.000096 0.050080 0.044158 ... 0.082405 0.064720 0.079986 0.107187 0.010338 -0.002968 0.023203 0.035174 -0.044224 -0.029866
diagnosis 0.039769 1.000000 0.730029 0.415185 0.742636 0.708984 0.358560 0.596534 0.696360 0.776614 ... 0.776454 0.456903 0.782914 0.733825 0.421465 0.590998 0.659610 0.793566 0.416294 0.323872
radius_mean 0.074626 0.730029 1.000000 0.323782 0.997855 0.987357 0.170581 0.506124 0.676764 0.822529 ... 0.969539 0.297008 0.965137 0.941082 0.119616 0.413463 0.526911 0.744214 0.163953 0.007066
texture_mean 0.099770 0.415185 0.323782 1.000000 0.329533 0.321086 -0.023389 0.236702 0.302418 0.293464 ... 0.352573 0.912045 0.358040 0.343546 0.077503 0.277830 0.301025 0.295316 0.105008 0.119205
perimeter_mean 0.073159 0.742636 0.997855 0.329533 1.000000 0.986507 0.207278 0.556936 0.716136 0.850977 ... 0.969476 0.303038 0.970387 0.941550 0.150549 0.455774 0.563879 0.771241 0.189115 0.051019
area_mean 0.096893 0.708984 0.987357 0.321086 0.986507 1.000000 0.177028 0.498502 0.685983 0.823269 ... 0.962746 0.287489 0.959120 0.959213 0.123523 0.390410 0.512606 0.722017 0.143570 0.003738
smoothness_mean -0.012968 0.358560 0.170581 -0.023389 0.207278 0.177028 1.000000 0.659123 0.521984 0.553695 ... 0.213120 0.036072 0.238853 0.206718 0.805324 0.472468 0.434926 0.503053 0.394309 0.499316
compactness_mean 0.000096 0.596534 0.506124 0.236702 0.556936 0.498502 0.659123 1.000000 0.883121 0.831135 ... 0.535315 0.248133 0.590210 0.509604 0.565541 0.865809 0.816275 0.815573 0.510223 0.687382
concavity_mean 0.050080 0.696360 0.676764 0.302418 0.716136 0.685983 0.521984 0.883121 1.000000 0.921391 ... 0.688236 0.299879 0.729565 0.675987 0.448822 0.754968 0.884103 0.861323 0.409464 0.514930
concave points_mean 0.044158 0.776614 0.822529 0.293464 0.850977 0.823269 0.553695 0.831135 0.921391 1.000000 ... 0.830318 0.292752 0.855923 0.809630 0.452753 0.667454 0.752399 0.910155 0.375744 0.368661
symmetry_mean -0.022114 0.330499 0.147741 0.071401 0.183027 0.151293 0.557775 0.602641 0.500667 0.462497 ... 0.185728 0.090651 0.219169 0.177193 0.426675 0.473200 0.433721 0.430297 0.699826 0.438413
fractal_dimension_mean -0.052511 -0.012838 -0.311631 -0.076437 -0.261477 -0.283110 0.584792 0.565369 0.336783 0.166917 ... -0.253691 -0.051269 -0.205151 -0.231854 0.504942 0.458798 0.346234 0.175325 0.334019 0.767297
radius_se 0.143048 0.567134 0.679090 0.275869 0.691765 0.732562 0.301467 0.497473 0.631925 0.698050 ... 0.715065 0.194799 0.719684 0.751548 0.141919 0.287103 0.380585 0.531062 0.094543 0.049559
texture_se -0.007526 -0.008303 -0.097317 0.386358 -0.086761 -0.066280 0.068406 0.046205 0.076218 0.021480 ... -0.111690 0.409003 -0.102242 -0.083195 -0.073658 -0.092439 -0.068956 -0.119638 -0.128215 -0.045655
perimeter_se 0.137331 0.556141 0.674172 0.281673 0.693135 0.726628 0.296092 0.548905 0.660391 0.710650 ... 0.697201 0.200371 0.721031 0.730713 0.130054 0.341919 0.418899 0.554897 0.109930 0.085433
area_se 0.177742 0.548236 0.735864 0.259845 0.744983 0.800086 0.246552 0.455653 0.617427 0.690299 ... 0.757373 0.196497 0.761213 0.811408 0.125389 0.283257 0.385100 0.538166 0.074126 0.017539
smoothness_se 0.096781 -0.067016 -0.222600 0.006614 -0.202694 -0.166777 0.332375 0.135299 0.098564 0.027653 ... -0.230691 -0.074743 -0.217304 -0.182195 0.314457 -0.055558 -0.058298 -0.102007 -0.107342 0.101480
compactness_se 0.033961 0.292999 0.206000 0.191975 0.250744 0.212583 0.318943 0.738722 0.670279 0.490424 ... 0.204607 0.143003 0.260516 0.199371 0.227394 0.678780 0.639147 0.483208 0.277878 0.590973
concavity_se 0.055239 0.253730 0.194204 0.143293 0.228082 0.207660 0.248396 0.570517 0.691270 0.439167 ... 0.186904 0.100241 0.226680 0.188353 0.168481 0.484858 0.662564 0.440472 0.197788 0.439329
concave points_se 0.078768 0.408042 0.376169 0.163851 0.407217 0.372320 0.380676 0.642262 0.683260 0.615634 ... 0.358127 0.086741 0.394999 0.342271 0.215351 0.452888 0.549592 0.602450 0.143116 0.310655
symmetry_se -0.017306 -0.006522 -0.104321 0.009127 -0.081629 -0.072497 0.200774 0.229977 0.178009 0.095351 ... -0.128121 -0.077473 -0.103753 -0.110343 -0.012662 0.060255 0.037119 -0.030413 0.389402 0.078079
fractal_dimension_se 0.025725 0.077972 -0.042641 0.054458 -0.005523 -0.019887 0.283607 0.507318 0.449301 0.257584 ... -0.037488 -0.003195 -0.001000 -0.022736 0.170568 0.390159 0.379975 0.215204 0.111094 0.591328
radius_worst 0.082405 0.776454 0.969539 0.352573 0.969476 0.962746 0.213120 0.535315 0.688236 0.830318 ... 1.000000 0.359921 0.993708 0.984015 0.216574 0.475820 0.573975 0.787424 0.243529 0.093492
texture_worst 0.064720 0.456903 0.297008 0.912045 0.303038 0.287489 0.036072 0.248133 0.299879 0.292752 ... 0.359921 1.000000 0.365098 0.345842 0.225429 0.360832 0.368366 0.359755 0.233027 0.219122
perimeter_worst 0.079986 0.782914 0.965137 0.358040 0.970387 0.959120 0.238853 0.590210 0.729565 0.855923 ... 0.993708 0.365098 1.000000 0.977578 0.236775 0.529408 0.618344 0.816322 0.269493 0.138957
area_worst 0.107187 0.733825 0.941082 0.343546 0.941550 0.959213 0.206718 0.509604 0.675987 0.809630 ... 0.984015 0.345842 0.977578 1.000000 0.209145 0.438296 0.543331 0.747419 0.209146 0.079647
smoothness_worst 0.010338 0.421465 0.119616 0.077503 0.150549 0.123523 0.805324 0.565541 0.448822 0.452753 ... 0.216574 0.225429 0.236775 0.209145 1.000000 0.568187 0.518523 0.547691 0.493838 0.617624
compactness_worst -0.002968 0.590998 0.413463 0.277830 0.455774 0.390410 0.472468 0.865809 0.754968 0.667454 ... 0.475820 0.360832 0.529408 0.438296 0.568187 1.000000 0.892261 0.801080 0.614441 0.810455
concavity_worst 0.023203 0.659610 0.526911 0.301025 0.563879 0.512606 0.434926 0.816275 0.884103 0.752399 ... 0.573975 0.368366 0.618344 0.543331 0.518523 0.892261 1.000000 0.855434 0.532520 0.686511
concave points_worst 0.035174 0.793566 0.744214 0.295316 0.771241 0.722017 0.503053 0.815573 0.861323 0.910155 ... 0.787424 0.359755 0.816322 0.747419 0.547691 0.801080 0.855434 1.000000 0.502528 0.511114
symmetry_worst -0.044224 0.416294 0.163953 0.105008 0.189115 0.143570 0.394309 0.510223 0.409464 0.375744 ... 0.243529 0.233027 0.269493 0.209146 0.493838 0.614441 0.532520 0.502528 1.000000 0.537848
fractal_dimension_worst -0.029866 0.323872 0.007066 0.119205 0.051019 0.003738 0.499316 0.687382 0.514930 0.368661 ... 0.093492 0.219122 0.138957 0.079647 0.617624 0.810455 0.686511 0.511114 0.537848 1.000000

32 rows × 32 columns

In [18]:
#Heatmap

correlation_matrix = bc_data.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=.5)
plt.title('Correlation Matrix Heatmap')

# Show the plot
plt.show()
No description has been provided for this image

Handling Missing Values¶

In [19]:
missing_values = bc_data.isnull().sum()
missing_values
Out[19]:
id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

Data Cleaning¶

In [20]:
#Dropping Irrelevant columns

bc_data.drop('id', axis=1, inplace=True)
In [21]:
# Checking for Duplicates

duplicates = bc_data.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')
Number of duplicate rows: 0

Outlier Detection¶

In [22]:
X = bc_data.drop(columns=['diagnosis'])
In [23]:
# Boxplot before outlier removal
plt.figure(figsize=(16, 12))
sns.boxplot(data=X)
plt.xticks(rotation=90)
plt.title('Boxplot of Features Before Outlier Removal')

plt.savefig('boxplot_before_outlier_removal.png', dpi=300, bbox_inches='tight') 
plt.show()
No description has been provided for this image

Applying Isolation Forest Algorithm¶

In [24]:
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(X)
bc_data['outlier'] = outliers

# Separating the non-outliers
bc_data_cleaned = bc_data[bc_data['outlier'] != -1].drop(columns=['outlier'])
In [25]:
# Boxplot after outlier removal

X_no_outliers = bc_data_cleaned.drop(columns=['diagnosis'])
plt.figure(figsize=(16, 12))
sns.boxplot(data=X_no_outliers)
plt.xticks(rotation=90)
plt.title('Boxplot of Features After Outlier Removal')
plt.savefig('boxplot_after_outlier_removal.png', dpi=300, bbox_inches='tight') 
plt.show()
No description has been provided for this image
In [26]:
# cleaned dataset

print(bc_data_cleaned.head())
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
4          1        20.29         14.34          135.10     1297.0   
5          1        12.45         15.70           82.57      477.1   
6          1        18.25         19.98          119.60     1040.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
4          0.10030           0.13280          0.1980              0.10430   
5          0.12780           0.17000          0.1578              0.08089   
6          0.09463           0.10900          0.1127              0.07400   

   symmetry_mean  ...  radius_worst  texture_worst  perimeter_worst  \
1         0.1812  ...         24.99          23.41            158.8   
2         0.2069  ...         23.57          25.53            152.5   
4         0.1809  ...         22.54          16.67            152.2   
5         0.2087  ...         15.47          23.75            103.4   
6         0.1794  ...         22.88          27.66            153.2   

   area_worst  smoothness_worst  compactness_worst  concavity_worst  \
1      1956.0            0.1238             0.1866           0.2416   
2      1709.0            0.1444             0.4245           0.4504   
4      1575.0            0.1374             0.2050           0.4000   
5       741.6            0.1791             0.5249           0.5355   
6      1606.0            0.1442             0.2576           0.3784   

   concave points_worst  symmetry_worst  fractal_dimension_worst  
1                0.1860          0.2750                  0.08902  
2                0.2430          0.3613                  0.08758  
4                0.1625          0.2364                  0.07678  
5                0.1741          0.3985                  0.12440  
6                0.1932          0.3063                  0.08368  

[5 rows x 31 columns]
In [27]:
bc_data.to_csv('C://Users/chand/Downloads/pre-processed-Data.csv', index=False)
In [28]:
print(bc_data.head())
   diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0          1        17.99         10.38          122.80     1001.0   
1          1        20.57         17.77          132.90     1326.0   
2          1        19.69         21.25          130.00     1203.0   
3          1        11.42         20.38           77.58      386.1   
4          1        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   symmetry_mean  ...  texture_worst  perimeter_worst  area_worst  \
0         0.2419  ...          17.33           184.60      2019.0   
1         0.1812  ...          23.41           158.80      1956.0   
2         0.2069  ...          25.53           152.50      1709.0   
3         0.2597  ...          26.50            98.87       567.7   
4         0.1809  ...          16.67           152.20      1575.0   

   smoothness_worst  compactness_worst  concavity_worst  concave points_worst  \
0            0.1622             0.6656           0.7119                0.2654   
1            0.1238             0.1866           0.2416                0.1860   
2            0.1444             0.4245           0.4504                0.2430   
3            0.2098             0.8663           0.6869                0.2575   
4            0.1374             0.2050           0.4000                0.1625   

   symmetry_worst  fractal_dimension_worst  outlier  
0          0.4601                  0.11890       -1  
1          0.2750                  0.08902        1  
2          0.3613                  0.08758        1  
3          0.6638                  0.17300       -1  
4          0.2364                  0.07678        1  

[5 rows x 32 columns]

Feature Scaling¶

In [29]:
scaler = StandardScaler()
features = bc_data.drop(columns=[ 'diagnosis'])
scaled_features = scaler.fit_transform(features)
In [30]:
# Convert the scaled features back to a DataFrame

scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)

Feature Selection using RFE¶

In [31]:
model = RandomForestClassifier(random_state = 42)
rfe = RFE(estimator=model, n_features_to_select=10)
rfe = rfe.fit(scaled_features_df, bc_data['diagnosis'])
In [32]:
# Get the selected features
selected_features = scaled_features_df.columns[rfe.support_]
selected_features
Out[32]:
Index(['perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'concavity_worst', 'concave points_worst'],
      dtype='object')
In [33]:
# Extract the selected features from the previous RFE process
selected_features = scaled_features_df.columns[rfe.support_]

# Prepare the feature importance data for the selected features
selected_importances = rfe.estimator_.feature_importances_
selected_feature_importance_df = pd.DataFrame({
    'Feature': selected_features,
    'Importance': selected_importances
}).sort_values(by='Importance', ascending=False)

# Horizontal Bar Plot of Selected Feature Importances
plt.figure(figsize=(14, 6))
sns.barplot(x='Importance', y='Feature', data=selected_feature_importance_df, palette='viridis', orient='h')
plt.title('Selected Feature Importances after RFE', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.show()
No description has been provided for this image

Feature Engineering using Polynomial Feature¶

In [34]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(scaled_features_df[selected_features])

# Convert the polynomial features to a DataFrame
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(selected_features))
In [35]:
# Combining the polynomial features and target variable into one DataFrame

data_poly = pd.concat([X_poly_df, bc_data['diagnosis'].reset_index(drop=True)], axis=1)
data_poly
Out[35]:
perimeter_mean area_mean concavity_mean concave points_mean radius_worst texture_worst perimeter_worst area_worst concavity_worst concave points_worst ... perimeter_worst area_worst perimeter_worst concavity_worst perimeter_worst concave points_worst area_worst^2 area_worst concavity_worst area_worst concave points_worst concavity_worst^2 concavity_worst concave points_worst concave points_worst^2 diagnosis
0 1.269934 0.984375 2.652874 2.532475 1.886690 -1.359293 2.303601 2.001237 2.109526 2.296076 ... 4.610052 4.859506 5.289242 4.004951 4.221663 4.594994 4.450101 4.843633 5.271966 1
1 1.685955 1.908708 -0.023846 0.548144 1.805927 -0.369203 1.535126 1.890489 -0.146749 1.087084 ... 2.902139 -0.225278 1.668811 3.573949 -0.277427 2.055121 0.021535 -0.159528 1.181752 1
2 1.566503 1.558884 1.363478 2.037231 1.511870 -0.023974 1.347475 1.456285 0.854974 1.955000 ... 1.962307 1.152056 2.634315 2.120765 1.245085 2.847037 0.730980 1.671474 3.822026 1
3 -0.592687 -0.764464 1.915897 1.451707 -0.281464 0.133984 -0.249939 -0.550021 1.989588 2.175786 ... 0.137472 -0.497276 -0.543814 0.302523 -1.094316 -1.196728 3.958461 4.328918 4.734045 1
4 1.776573 1.826229 1.371011 1.428493 1.298575 -1.466770 1.338539 1.220724 0.613179 0.729259 ... 1.633988 0.820764 0.976142 1.490168 0.748522 0.890224 0.375988 0.447166 0.531819 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
564 2.060786 2.343856 1.947285 2.320965 1.901185 0.117700 1.752563 2.015301 0.664512 1.629151 ... 3.531941 1.164599 2.855189 4.061437 1.339192 3.283230 0.441577 1.082591 2.654134 1
565 1.615931 1.723842 0.693043 1.263669 1.536720 2.047399 1.421940 1.494959 0.236573 0.733827 ... 2.125741 0.336393 1.043458 2.234901 0.353667 1.097041 0.055967 0.173604 0.538502 1
566 0.672676 0.577953 0.046588 0.105777 0.561361 1.374854 0.579001 0.427906 0.326767 0.414069 ... 0.247758 0.189198 0.239746 0.183103 0.139825 0.177182 0.106776 0.135304 0.171453 1
567 1.982524 1.735218 3.296944 2.658866 1.961239 2.237926 2.303601 1.653171 3.197605 2.289985 ... 3.808245 7.366004 5.275212 2.732974 5.286187 3.785737 10.224676 7.322468 5.244034 1
568 -1.814389 -1.347789 -1.114873 -1.261820 -1.410893 0.764190 -1.432735 -1.075813 -1.305831 -1.745063 ... 1.541355 1.870909 2.500212 1.157373 1.404829 1.877361 1.705194 2.278757 3.045244 0

569 rows × 66 columns

In [36]:
# Plot pairplot for a subset of polynomial features

import matplotlib.patches as mpatches

sns.pairplot(data_poly[['perimeter_worst', 'area_worst', 'perimeter_worst^2', 'area_worst^2', 'diagnosis']], hue='diagnosis')

plt.savefig('pairplot.png', dpi=300, bbox_inches='tight') 
plt.show()
No description has been provided for this image

Data Balancing¶

In [37]:
# Separate the majority and minority classes
majority_class = data_poly[data_poly['diagnosis'] == 0]
minority_class = data_poly[data_poly['diagnosis'] == 1]

# Over-sample the minority class
minority_class_over = resample(minority_class, 
                               replace=True,  
                               n_samples=len(majority_class),  
                               random_state=42)  

# Combine the majority class with the over-sampled minority class
data_balanced_over = pd.concat([majority_class, minority_class_over])

# Split the balanced dataset into features and target variable
X_balanced_over = data_balanced_over.drop(columns=['diagnosis'])
y_balanced_over = data_balanced_over['diagnosis']

# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced_over, y_balanced_over, test_size=0.3, random_state=42)
print(y_train.value_counts())
diagnosis
1    259
0    240
Name: count, dtype: int64
In [38]:
# Distribution of the target variable before and after balancing
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.countplot(x=bc_data['diagnosis'], palette='viridis')
plt.title('Distribution of Target Variable Before Balancing')
plt.xlabel('Diagnosis')
plt.ylabel('Count')

plt.subplot(1, 2, 2)
sns.countplot(x=y_balanced_over, palette='viridis')
plt.title('Distribution of Target Variable After Balancing')
plt.xlabel('Diagnosis')
plt.ylabel('Count')

plt.tight_layout()
plt.show()
No description has been provided for this image

Split Data¶

In [39]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced_over, y_balanced_over, test_size=0.3, random_state=42)

print("Shape of training data:", X_train.shape)
print("Shape of testing data:", X_test.shape)
Shape of training data: (499, 65)
Shape of testing data: (215, 65)

Support Vector Machine (SVM)¶

In [40]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Set random seed for reproducibility
np.random.seed(42)

# Define the parameter grid for SVM
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf', 'linear']
}

# Initialize and fit the grid search
grid_search = GridSearchCV(estimator=SVC(probability=True, random_state=42),
                           param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Print best parameters and best cross-validation accuracy
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Getting the best SVM model
best_svm_model = grid_search.best_estimator_

# Evaluate the best model on the training data
y_train_pred = best_svm_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)

# Evaluate the best model on the testing data
y_test_pred = best_svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'C': 100, 'gamma': 1, 'kernel': 'linear'}
Best cross-validation accuracy: 0.9720
Training Accuracy: 1.0
Testing Accuracy: 0.9395348837209302
In [41]:
# Print classification report and confusion matrix for test data
print("SVM Classification Report:\n", classification_report(y_test, y_test_pred))

# Calculate confusion matrix and sensitivity (recall) and specificity

conf_matrix_svm = confusion_matrix(y_test, y_test_pred)
sensitivity_svm = conf_matrix_svm[1, 1] / (conf_matrix_svm[1, 0] + conf_matrix_svm[1, 1])
specificity_svm = conf_matrix_svm[0, 0] / (conf_matrix_svm[0, 0] + conf_matrix_svm[0, 1])

# Print sensitivity, specificity, and confusion matrix

print("Sensitivity (Recall) for Class 1:", sensitivity_svm)
print("Specificity (True Negative Rate) for Class 0:", specificity_svm)
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.93      0.94       117
           1       0.92      0.95      0.93        98

    accuracy                           0.94       215
   macro avg       0.94      0.94      0.94       215
weighted avg       0.94      0.94      0.94       215

Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9316239316239316
In [42]:
# Plot confusion matrix 

disp_svm = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_svm)
disp_svm.plot(cmap='Blues')
plt.title('SVM Confusion Matrix')
plt.show()
No description has been provided for this image
In [43]:
# Predict probabilities and compute ROC curve and AUC
y_prob_svm = best_svm_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob_svm)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='orange', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

#LIME¶

In [44]:
import lime
import lime.lime_tabular
In [45]:
# Create a LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    feature_names=X_train.columns,
    class_names=['Benign', 'Malignant'],
    mode='classification'
)
# Explain a prediction
i = 0  # Change this index to explain different predictions
exp = explainer.explain_instance(
    data_row=X_test.values[i],
    predict_fn=best_svm_model.predict_proba
)
In [46]:
# Show the explanation
exp.show_in_notebook(show_table=True, show_all=False)
In [47]:
fig = exp.as_pyplot_figure()
plt.show()
No description has been provided for this image

Multi-Layer Perceptron (MLP)¶

In [48]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Set random seed for reproducibility
np.random.seed(42)

# Train MLP model
mlp_model = MLPClassifier(random_state=42, max_iter=300)
mlp_model.fit(X_train, y_train)

# Predict and evaluate the MLP model
y_pred_mlp = mlp_model.predict(X_test)
In [49]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_mlp)

classification_report_mlp = classification_report(y_test, y_pred_mlp, output_dict=True)

# Calculate sensitivity (recall) and specificity
conf_matrix_mlp = confusion_matrix(y_test, y_pred_mlp)
sensitivity = conf_matrix_mlp[1, 1] / (conf_matrix_mlp[1, 0] + conf_matrix_mlp[1, 1])
specificity = conf_matrix_mlp[0, 0] / (conf_matrix_mlp[0, 0] + conf_matrix_mlp[0, 1])
In [52]:
# Print classification report, sensitivity, specificity, and accuracy
# Classification report
print("MLP Classification Report:")
print(classification_report(y_test, y_pred_mlp))

# Accuracy score
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)

# Print and display the confusion matrix
print(" MLP Confusion Matrix:")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_mlp)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
MLP Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       117
           1       0.95      0.95      0.95        98

    accuracy                           0.95       215
   macro avg       0.95      0.95      0.95       215
weighted avg       0.95      0.95      0.95       215

MLP Accuracy: 0.9534883720930233
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9572649572649573
 MLP Confusion Matrix:
No description has been provided for this image
In [54]:
# Predict probabilities
y_prob_mlp = mlp_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_prob_mlp)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='green', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MLP')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

Random Forest¶

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Set random seed for reproducibility
np.random.seed(42)

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],       # Number of trees in the forest
    'max_features': ['auto', 'sqrt'],      # Number of features to consider at every split
    'max_depth': [10, 20, 30, None],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],       # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],         # Minimum number of samples required at a leaf node
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Initialize and fit the grid search
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                              param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train, y_train)

# Print best parameters and best cross-validation accuracy
print(f"Best parameters: {rf_grid_search.best_params_}")
print(f"Best cross-validation accuracy: {rf_grid_search.best_score_:.4f}")

# Get the best Random Forest model
best_rf_model = rf_grid_search.best_estimator_

# Evaluate the best model on the training data
y_train_pred_rf = best_rf_model.predict(X_train)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
print("Training Accuracy:", train_accuracy_rf)

# Evaluate the best model on the testing data
y_test_pred_rf = best_rf_model.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print("Testing Accuracy:", test_accuracy_rf)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Best cross-validation accuracy: 0.9860
Training Accuracy: 1.0
Testing Accuracy: 0.958139534883721
In [56]:
# Print classification report and confusion matrix for test data

print("Random Forest Classification Report:\n", classification_report(y_test, y_test_pred_rf))

# Calculate confusion matrix and sensitivity (recall) and specificity

conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)
sensitivity_rf = conf_matrix_rf[1, 1] / (conf_matrix_rf[1, 0] + conf_matrix_rf[1, 1])
specificity_rf = conf_matrix_rf[0, 0] / (conf_matrix_rf[0, 0] + conf_matrix_rf[0, 1])

# Print sensitivity, specificity, and confusion matrix

print("Sensitivity (Recall) for Class 1:", sensitivity_rf)
print("Specificity (True Negative Rate) for Class 0:", specificity_rf)
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96       117
           1       0.96      0.95      0.95        98

    accuracy                           0.96       215
   macro avg       0.96      0.96      0.96       215
weighted avg       0.96      0.96      0.96       215

Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9658119658119658
In [57]:
# Plot confusion matrix

disp_rf = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_rf)
disp_rf.plot(cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.show()
No description has been provided for this image

ROC Curve¶

In [58]:
# Get predicted probabilities for the positive class (class 1)
y_test_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC score
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_test_proba_rf)
roc_auc_rf = roc_auc_score(y_test, y_test_proba_rf)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc_rf:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')  # Diagonal line (random model)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Random Forest Model')
plt.legend(loc="lower right")
plt.grid()
plt.show()
No description has been provided for this image

XGBOOST¶

In [62]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score
In [63]:
# Define the parameter grid for XGBoost
param_grid_xgb = {
    'objective': ['binary:logistic'],
    'eval_metric': ['logloss'],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)

# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(
    estimator=xgb_model, param_grid=param_grid_xgb, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
# Fit GridSearchCV
grid_search_xgb.fit(X_train, y_train)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Out[63]:
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num_parallel_tree=None,
                                     random_state=42, ...),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.8, 1.0],
                         'eval_metric': ['logloss'],
                         'learning_rate': [0.01, 0.1, 0.2],
                         'max_depth': [3, 6, 9],
                         'n_estimators': [100, 200, 300],
                         'objective': ['binary:logistic'],
                         'subsample': [0.8, 1.0]},
             scoring='accuracy', verbose=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, device=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     grow_policy=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None,...
                                     missing=nan, monotone_constraints=None,
                                     multi_strategy=None, n_estimators=None,
                                     n_jobs=None, num_parallel_tree=None,
                                     random_state=42, ...),
             n_jobs=-1,
             param_grid={'colsample_bytree': [0.8, 1.0],
                         'eval_metric': ['logloss'],
                         'learning_rate': [0.01, 0.1, 0.2],
                         'max_depth': [3, 6, 9],
                         'n_estimators': [100, 200, 300],
                         'objective': ['binary:logistic'],
                         'subsample': [0.8, 1.0]},
             scoring='accuracy', verbose=2)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=1.0, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
In [64]:
# Get the best XGBoost model
best_xgb_model = grid_search_xgb.best_estimator_
print(f"Best parameters: {grid_search_xgb.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_xgb.best_score_:.4f}")

# Predict on the test set
y_pred_best_xgb = best_xgb_model.predict(X_test)
y_prob_best_xgb = best_xgb_model.predict_proba(X_test)[:, 1]  # Probability estimates for ROC curve
accuracy = accuracy_score(y_test, y_pred_best_xgb)
print("XGBoost Accuracy:", accuracy)
Best parameters: {'colsample_bytree': 1.0, 'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 1.0}
Best cross-validation accuracy: 0.9800
XGBoost Accuracy: 0.9627906976744186
In [65]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best_xgb))
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       117
           1       0.96      0.96      0.96        98

    accuracy                           0.96       215
   macro avg       0.96      0.96      0.96       215
weighted avg       0.96      0.96      0.96       215

In [66]:
# Compute confusion matrix
conf_matrix_xgb = confusion_matrix(y_test, y_pred_best_xgb)

# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_xgb, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues')
plt.title(' XGBoost Confusion Matrix')
plt.show()
No description has been provided for this image
In [67]:
# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob_best_xgb)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

XAI - SHAP¶

In [68]:
explainer = shap.Explainer(best_xgb_model, X_train)
shap_values = explainer(X_test)
In [69]:
shap.summary_plot(shap_values, X_test)
# Save the plot to a file
plt.savefig('xgboostplot.png', dpi=300, bbox_inches='tight') 
plt.show()
No description has been provided for this image
<Figure size 640x480 with 0 Axes>
In [70]:
shap.dependence_plot("perimeter_worst", shap_values.values, X_test)
No description has been provided for this image
In [71]:
#force plot
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values.values[0], X_test.iloc[0, :])
No description has been provided for this image
Out[71]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [72]:
#waterfall plot

#Breaks down an individual prediction to show how each feature contributes to the final prediction.
shap.waterfall_plot(shap.Explanation(values=shap_values[0], base_values=explainer.expected_value, data=X_test.iloc[0]))
No description has been provided for this image

CATBOOST¶

In [73]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# Train CatBoost model
catboost_model = CatBoostClassifier(random_state=42, verbose=0)
catboost_model.fit(X_train, y_train)

# Predict and evaluate the CatBoost model
y_pred_catboost = catboost_model.predict(X_test)
In [74]:
# Classification report
classification_report_catboost = classification_report(y_test, y_pred_catboost, output_dict=True)

# Calculate sensitivity (recall) and specificity

sensitivity = classification_report_catboost['1']['recall']
conf_matrix_catboost = confusion_matrix(y_test, y_pred_catboost)
specificity = conf_matrix_catboost[0, 0] / (conf_matrix_catboost[0, 0] + conf_matrix_catboost[0, 1])
accuracy = accuracy_score(y_test, y_pred_catboost)
In [75]:
# Print classification report, sensitivity, specificity, and accuracy
print("CatBoost Classification Report:")
print(classification_report(y_test, y_pred_catboost))
print("CatBoost Accuracy:", accuracy)
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)

# Print confusion matrix
print("Confusion Matrix:")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_catboost)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
CatBoost Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       117
           1       0.97      0.95      0.96        98

    accuracy                           0.96       215
   macro avg       0.96      0.96      0.96       215
weighted avg       0.96      0.96      0.96       215

CatBoost Accuracy: 0.9627906976744186
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9743589743589743
Confusion Matrix:
No description has been provided for this image
In [76]:
# Predict probabilities
y_prob_catboost = catboost_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_prob_catboost)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='purple', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for CatBoost')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

SHAP - CATBOOST¶

In [77]:
explainer1 = shap.TreeExplainer(catboost_model)
shap_values1 = explainer.shap_values(X_test)
In [78]:
if isinstance(shap_values, list):
    shap_values1 = shap_values[0]
In [79]:
# Summary plot
shap.summary_plot(shap_values1, X_test)
No description has been provided for this image
In [80]:
# Dependence plot for a specific feature

shap.dependence_plot("area_worst", shap_values1, X_test)
No description has been provided for this image
In [81]:
# Calculating SHAP interaction values

explainer_catboost = shap.TreeExplainer(catboost_model)
shap_interaction_values_catboost = explainer_catboost.shap_interaction_values(X_test)
In [82]:
# Interaction value matrix plot

shap.summary_plot(shap_interaction_values_catboost, X_test,  plot_type="interaction")
No description has been provided for this image

Stacking Classifier - Meta-model¶

#Defining all the base models¶

In [107]:
from xgboost import XGBClassifier
# Define base models
base_models = [
    ('MLP', MLPClassifier(random_state=42, max_iter=300)),
    ('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
    ('CatBboost', CatBoostClassifier(random_state=42, verbose=0))
]
In [108]:
# Define meta-learner

meta_learner = LogisticRegression()
In [109]:
# Creating stacking classifier

stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5  # Cross-validation folds for stacking
)
In [110]:
# Training the stacking classifier

stacking_clf.fit(X_train, y_train)
Out[110]:
StackingClassifier(cv=5,
                   estimators=[('MLP',
                                MLPClassifier(max_iter=300, random_state=42)),
                               ('XGBoost',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric='mlogloss',
                                              feature_types=None, gamma=None,
                                              grow_poli...
                                              max_cat_threshold=None,
                                              max_cat_to_onehot=None,
                                              max_delta_step=None,
                                              max_depth=None, max_leaves=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              multi_strategy=None,
                                              n_estimators=None, n_jobs=None,
                                              num_parallel_tree=None,
                                              random_state=42, ...)),
                               ('CatBboost',
                                <catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>)],
                   final_estimator=LogisticRegression())
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StackingClassifier(cv=5,
                   estimators=[('MLP',
                                MLPClassifier(max_iter=300, random_state=42)),
                               ('XGBoost',
                                XGBClassifier(base_score=None, booster=None,
                                              callbacks=None,
                                              colsample_bylevel=None,
                                              colsample_bynode=None,
                                              colsample_bytree=None,
                                              device=None,
                                              early_stopping_rounds=None,
                                              enable_categorical=False,
                                              eval_metric='mlogloss',
                                              feature_types=None, gamma=None,
                                              grow_poli...
                                              max_cat_threshold=None,
                                              max_cat_to_onehot=None,
                                              max_delta_step=None,
                                              max_depth=None, max_leaves=None,
                                              min_child_weight=None,
                                              missing=nan,
                                              monotone_constraints=None,
                                              multi_strategy=None,
                                              n_estimators=None, n_jobs=None,
                                              num_parallel_tree=None,
                                              random_state=42, ...)),
                               ('CatBboost',
                                <catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>)],
                   final_estimator=LogisticRegression())
MLPClassifier(max_iter=300, random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='mlogloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)
<catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>
LogisticRegression()
In [111]:
# Predict and evaluate the stacking classifier

y_pred_stacking = stacking_clf.predict(X_test)
y_prob_stacking = stacking_clf.predict_proba(X_test)[:, 1]
In [112]:
# Classification report
print("Stacking Classifier Classification Report:")
print(classification_report(y_test, y_pred_stacking))

# Accuracy score
accuracy = accuracy_score(y_test, y_pred_stacking)
print("Stacking Classifier Accuracy:", accuracy)

# Calculate sensitivity (recall) and specificity
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
sensitivity = conf_matrix_stacking[1, 1] / (conf_matrix_stacking[1, 0] + conf_matrix_stacking[1, 1])
specificity = conf_matrix_stacking[0, 0] / (conf_matrix_stacking[0, 0] + conf_matrix_stacking[0, 1])
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)

# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix_stacking)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_stacking)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
Stacking Classifier Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       117
           1       0.97      0.95      0.96        98

    accuracy                           0.96       215
   macro avg       0.96      0.96      0.96       215
weighted avg       0.96      0.96      0.96       215

Stacking Classifier Accuracy: 0.9627906976744186
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9743589743589743
Confusion Matrix:
[[114   3]
 [  5  93]]
No description has been provided for this image
In [113]:
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='purple', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Stacking Classifier')
plt.legend(loc="lower right")
plt.show()
No description has been provided for this image

PDP¶

In [114]:
from sklearn.inspection import PartialDependenceDisplay
In [115]:
# For a single feature

PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [2], feature_names=X_train.columns)
plt.show()
No description has been provided for this image
In [116]:
# Define colors for each feature
colors = ['red', 'green', 'blue']

# Create PDP plots with different colors
fig, axs = plt.subplots(1, len(features), figsize=(15, 5))

features = [6, 3, 7]  # Indices of features to plot

for i, (feature, color) in enumerate(zip(features, colors)):
    PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [feature], feature_names=X_train.columns, ax=axs[i], line_kw={'color': color})

plt.tight_layout()
plt.show()
No description has been provided for this image
In [117]:
# For interaction between two features

PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [(6, 3)], feature_names=X_train.columns)
plt.show()
No description has been provided for this image
In [118]:
import lime
from lime.lime_tabular import LimeTabularExplainer

explainer = LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['0', '1'], mode='classification')
lime_exp = explainer.explain_instance(X_test.iloc[0].values, stacking_clf.predict_proba)
lime_exp.show_in_notebook()